Set-up packages // libraries and prepare for data import:
library(psych)
library(likert)
library(tidyverse)
library(jsonlite)
library(ggplot2)
library(mosaic)
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
Set-up directories, import and clean data:
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R")
getwd()
[1] "/Users/allieblaising/Desktop/bang/R"
dataPath = "../.data"
## Define function to extract survey results:
extractSurvey = function(frame,survey) {
rounds = seq(1,length(frame$results.format[[1]]))
roundResponses = lapply(rounds, function(round) {
getCol = paste("results.",survey,".",round, sep="")
surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
surveyFrame = frame[,surveyCols]
if (is.null(newCols)) {return("No newCols")}
names(surveyFrame) = newCols
surveyFrame$id = frame$id
surveyFrame$round = round
surveyFrame$batch = frame$batch
surveyFrame$rooms = frame$rooms
surveyFrame$manipulation = frame$results.manipulationCheck
surveyFrame$blacklist = frame$results.blacklistCheck
return(surveyFrame)
})
return(Reduce(rbind,roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below):
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) {
if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
return(any(batchData$batchComplete == TRUE))
}
return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector = TRUE)
return(flatten(userFile, recursive = TRUE))
})
## Retroactively find rooms from chat data:
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
roomsForIndividual = lapply(seq(1,3),function(y) {
x$room = x$rooms[y]
x$round = y
return(x)
})
return(Reduce(rbind,roomsForIndividual))
})
More cleaning before visualizations:
## Apply extract survey function to extract the right columns and rows for viability survey:
survey = 'viabilityCheck'
frame <- extractSurvey(overlappingFiles, survey)
## Reduce to vertically combine rows in roundwithRooms list:
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe:
## MSB: complete.cases doesn't work when there aren't NAs, but empty lists, alternatives rather than subsetting where
## blacklist is empty? Doens't work: test <- complete.cases(data)
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge:
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe:
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds:
data2 = data2 %>% select(id, batch, room, bonus, name, friends,
friends_history, results.condition, results.format,
results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (this should be simplified)
data2$batch <- unlist(data2$batch)
data$batch <- unlist(data$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id)
## Before merge, data and data2 should ahve the same # of observations
## Merge columns by id, round and batch #s:
data <- left_join(data, data2, by=NULL)
Joining, by = c("id", "round", "batch")
## Subset only observations with batch #s in complete batches
allConditions <- data[data$batch %in% completeBatches, ]
Conditionally assign conditions based on treatment and results column:
## Messy, but robust? Verify, verify, verify.
data <- data %>% mutate(
condition = case_when(
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A",
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B",
results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap",
results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A",
results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A",
results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B",
results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap",
results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A",
results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==1 ~ "A" ,
results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==2 ~ "B" ,
results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==3 ~ "C"
))
Set-up for factors for viability questions:
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off):
data <- na.omit(data)
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree")
clean <- data %>%
mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels)))
## Viabilitylikert visuals:
## Subset only viabilitySurvey columns and condition column (condition column used to visualize likert responses for conditions)
viabilityLikert <- select(clean, contains("results.viabilityCheck"), "condition", "results.condition")
viabilityLabels = c("1. The members of this team could work for a long time together"
, "2. Most of the members of this team would welcome the opportunity to work as a group again in the future." ,
"3. This team has the capacity for long-term success.",
"4. This team has what it takes to be effective in the future.",
"5. This team would work well together in the future." ,
" 6. This team has positioned itself well for continued success.",
" 7. This team has the ability to perform well in the future. ",
" 8. This team has the ability to function as an ongoing unit." ,
" 9. This team should continue to function as a unit. ",
" 10. This team has the resources to perform well in the future. ",
" 11. This team is well positioned for growth over time. ",
" 12. This team can develop to meet future challenges. ",
" 13. This team has the capacity to sustain itself. ",
" 14. This team has what it takes to endure in future performance episodes.", "condition", "results.condition")
names(viabilityLikert) <- rep(viabilityLabels)
Likert visualizations:
## Likert graph for all viability responses across all conditions (i.e. baseline, control and treatment):
likert.out <- likert(viabilityLikert[-c(15:16)])
plot(likert.out)

## Subset A and Ap (i.e. A prime) responses for treatment group:
# Treatment + A:
treatmentA <- viabilityLikert %>% filter(condition=="A" & results.condition=="treatment")
likert.treatmentA <- likert(treatmentA[-c(15:16)])
plot(likert.treatmentA)

# Treatment + Ap:
treatmentAp <- viabilityLikert %>% filter(condition=="Ap" & results.condition=="treatment")
likert.treatmentAp <- likert(treatmentAp[-c(15:16)])
plot(likert.treatmentAp)

# Treatment + B:
treatmentB <- viabilityLikert %>% filter(condition=="B" & results.condition=="treatment")
likert.treatmentB <- likert(treatmentB[-c(15:16)])
plot(likert.treatmentAp)

## Customize code above to filter on other conditions and treatments of interest.
Exploratory data analysis part #1:
## Create a new dataframe that converts factors to numeric for statistical analyses:
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
stats$sum[i] <- sum(stats[i,1:14])
}
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping:
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
group_by(room, batch, round, condition, results.condition, results.format) %>%
summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n) %>%
## Filter out all teams with n=1 (i.e. a person in a chat room alone)
filter(n>1)
## Individual proportion:
individualProportion <- stats %>% group_by(round, batch, room) %>%
mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% filter(n>1)
## Table showing how many teams we've run in each condition combination:
table(groupedProportion$condition, groupedProportion$results.condition)
control treatment
A 18 37
Ap 18 37
B 19 35
Exploratory data analysis part #2:
## Plot of prop and mean per team with standard error:
ggplot(groupedProportion, aes(x=prop, y=mean)) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(subtitle="",
x="Proportion of answers to Q15: ",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

## Unlist for condition for graph:
groupedProportion$results.condition <- unlist(groupedProportion$results.condition)
## Team viability sum and Q15 factor repeat team answers facet_grid by condition, results.condition and results.format (all users)
g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Sum of viability measures grouped by repeat team question:",
x="Distribution of responses to Q15: ",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

## Team viability mean and repeat team answers facet_grid by results.condition, condition and results.format
g <- ggplot(stats, aes(factor(repeatTeam), mean))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Individual mean of viability measures grouped fracture measure score across all conditions",
x="Distribution of responses to Q15",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Individual fracture score vs. mean viability:",
x="",
y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition)

## groupedProportion: anlaysis on team level
ggplot(groupedProportion, aes(x=prop, y=mean)) +
geom_point() +
stat_smooth(method = "lm", col = "red") + labs(title="Team fracture proportion vs. team mean viability")

## Mean vs. median plot for viability sums per team:
ggplot(groupedProportion, aes(x=mean, y=median)) +
geom_point() +
stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of median vs. mean for team viability sums")

ggplot(data=groupedProportion, aes(groupedProportion$prop)) +
geom_histogram(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Count of team proportions of fracture value responses",
x="", y="Count") + facet_grid(condition ~ results.format)

Probability of fracture across teams and condition combinations:
tally(~fracture| condition, data = conditionalPropTreatment, format = "proportion")
condition
fracture A Ap
0 0.5405405 0.6756757
1 0.4594595 0.3243243
Boxplot for fracture scores vs. viability score sum (7-70)
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)
g <- ggplot(groupedProportionFracture, aes(factor(fracture), median))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(subtitle="Team median viability score in relation to binary fracture value",
x="Binary fracture value",
y="Numeric sum of viability measures questions (range: 7-70)") + scale_x_discrete(labels=c("0" = "No Fracture", "1" = "Fracture")) + facet_grid(condition ~ results.condition)

Fracture proportion scatterplots:
## LEARNING EFFECT
## Overall fracture change:
sum(ifelse(fractureLE1$fracture==fractureLE2$fracture,0,1))
longer object length is not a multiple of shorter object length
[1] 27
Absolute change in fracture proportions between conditions:
Histogram comparisons of fracture ratios across treatment, control and learning effect conditions
ggplot(data=groupedProportionFractureTreatment, aes(fracture)) +
geom_bar(breaks=seq(0, 1, by=0.20),
col="red",
fill="green",
alpha=.2) + labs(title="Fracture ratios: treatment condition") + facet_grid(condition ~ results.condition)
Ignoring unknown parameters: breaks

Graphs for mean distrbution per treatment condition:
Chat data:
g <- ggplot(chatFreqStats, aes(n)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(aes(fill=factor(round)),
bins=5,
col="black",
size=.1) + # change number of bins
labs(title="Histogram of chat line length per team (includes teams with n=1)",
fill = "Round") +
xlab(label="Number of lines from chat data per team") +
ylab(label="Count") + facet_grid(fracture1~fracture2)
Error: At least one layer must contain all faceting variables: `fracture1`.
* Plot is missing `fracture1`
* Layer 1 is missing `fracture1`

Conditional probabilities:
# Print out the probabilities
round(c(p1, p2, p3, p4), digits = 2)
[1] 0.56 0.44 0.71 0.29
---
title: "R Notebook"
output: html_notebook
---

## Set-up packages // libraries and prepare for data import: 
```{r}
## Uncomment install.packages and run outside of notebook environment: 
## install.packages(c("psych" ,"xtable", "tidyverse", "jsonlite", "likert", "ggplot2", "ploty", "mosaic"))

library(psych)
library(likert)
library(tidyverse)
library(jsonlite)
library(ggplot2)
library(mosaic)
library(plotly)
source("http://pcwww.liv.ac.uk/~william/R/crosstab.r")
theme_set(theme_classic())
```

## Set-up directories, import and clean data: 
```{r}
rm(list=ls())
setwd("/Users/allieblaising/desktop/bang/R") 
getwd()
dataPath = "../.data"
## Define function to extract survey results: 
extractSurvey = function(frame,survey) {
  rounds = seq(1,length(frame$results.format[[1]]))
  roundResponses = lapply(rounds, function(round) {
    getCol = paste("results.",survey,".",round, sep="")
    surveyCols = Filter(function(x) grepl(getCol,x),names(frame))
    newCols = lapply(surveyCols, function(x) gsub(getCol,paste("results.",survey, sep=""),x) )
    surveyFrame = frame[,surveyCols]
    if (is.null(newCols)) {return("No newCols")}
    names(surveyFrame) = newCols
    surveyFrame$id = frame$id
    surveyFrame$round = round
    surveyFrame$batch = frame$batch
    surveyFrame$rooms = frame$rooms
    surveyFrame$manipulation = frame$results.manipulationCheck
    surveyFrame$blacklist = frame$results.blacklistCheck
    return(surveyFrame)
  })
  return(Reduce(rbind,roundResponses))
}
#Find directory for import (be sure to verify that batch #s align from bangData import and imports below): 
batches = dir(dataPath, pattern = "^[0-9]+$" )
completeBatches = Filter(function(batch) { 
  if (any(dir(paste(dataPath,batch,sep="/")) == "batch.json") && (any(dir(paste(dataPath,batch,sep="/")) == "users.json")) ) {
    batchData = read_json(paste(dataPath,batch,"batch.json",sep="/"), simplifyVector = TRUE)
    return(any(batchData$batchComplete == TRUE))
  } 
  return(FALSE)
}, batches)
userFiles = lapply(completeBatches, function(batch) {
  userFile = read_json(paste(dataPath,batch,"users.json",sep="/"), simplifyVector = TRUE)
  return(flatten(userFile, recursive = TRUE))
})
## Retroactively find rooms from chat data: 
overlappingFiles = Reduce(function(x,y) merge(x, y, all=TRUE), userFiles)
roundsWithRooms = apply(overlappingFiles,1,function(x) {
  roomsForIndividual = lapply(seq(1,3),function(y) {
    x$room = x$rooms[y]
    x$round = y
    return(x)
  })
  return(Reduce(rbind,roomsForIndividual))
})
```

## More cleaning before visualizations: 
```{r}
## Apply extract survey function to extract the right columns and rows for viability survey: 
survey = 'viabilityCheck'
frame <- extractSurvey(overlappingFiles, survey)
## Reduce to vertically combine rows in roundwithRooms list:  
finalRounds = as.data.frame(Reduce(rbind,roundsWithRooms))
## Subset incomplete cases from viability survey dataframe: 
## MSB: complete.cases doesn't work when there aren't NAs, but empty lists, alternatives rather than subsetting where 
## blacklist is empty? Doens't work: test <- complete.cases(data)
data <- frame[frame$manipulation!="",]
## Rename room to rooms so that both are retained in future merge: 
data <- rename(data, rooms = "rooms")
## Subset incomplete cases for final rounds dataframe: 
data2 <- finalRounds[finalRounds$results.manipulationCheck!="", ]
## Select only variables of interest from final rounds: 
data2 = data2 %>% select(id, batch, room, bonus, name, friends, 
                         friends_history, results.condition, results.format,
                         results.manipulation,results.manipulationCheck,results.blacklistCheck, round)
## Convert to compatible data types before merge (this should be simplified)
data2$batch <- unlist(data2$batch)
data$batch <- unlist(data$batch)
data2$round <- unlist(data2$round)
data2$id <- unlist(data2$id)
## Before merge, data and data2 should ahve the same # of observations
## Merge columns by id, round and batch #s: 
data <- left_join(data, data2, by=NULL)
## Subset only observations with batch #s in complete batches 
allConditions <- data[data$batch %in% completeBatches, ]
```
## Conditionally assign conditions based on treatment and results column: 
```{r}
## Messy, but robust? Verify, verify, verify.  
data <- data %>% mutate(
  condition = case_when(
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='treatment' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='treatment' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='treatment' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" ,
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==2 ~ "B", 
    results.condition=='control' & results.format=="c(1, 2, 1)" & round==3 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==1 ~ "A", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==2 ~ "Ap", 
    results.condition=='control' & results.format=="c(1, 1, 2)" & round==3 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==1 ~ "B", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==2 ~ "A", 
    results.condition=='control' & results.format=="c(2, 1, 1)" & round==3 ~ "Ap" , 
    results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==1 ~ "A" ,
    results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==2 ~ "B" ,
    results.condition=='baseline' & results.format=="c(1, 2, 3)" & round==3 ~ "C" 
  )) 
```

## Set-up for factors for viability questions: 
```{r}
data <- rename(data, "repeatTeam" = results.viabilityCheck.15)
## Remove observations where viability survey wasn't on (remove this line if we want to keep observations with viability off): 
data <- na.omit(data)
levels <- c("Strongly Disagree", "Disagree", "Neutral","Agree", "Strongly Agree") 
clean <- data %>% 
  mutate_at(.vars = vars(contains("results.viabilityCheck")), funs(factor(., levels = levels))) 
## Viabilitylikert visuals: 
## Subset only viabilitySurvey columns and condition column (condition column used to visualize likert responses for conditions)
viabilityLikert <- select(clean, contains("results.viabilityCheck"), "condition", "results.condition")
viabilityLabels = c("1. The members of this team could work for a long time together" 
                    , "2. Most of the members of this team would welcome the opportunity to work as a group again in the future." , 
                    "3. This team has the capacity for long-term success.", 
                    "4. This team has what it takes to be effective in the future.", 
                    "5. This team would work well together in the future." , 
                    " 6. This team has positioned itself well for continued success.",  
                    " 7. This team has the ability to perform well in the future. ", 
                    " 8. This team has the ability to function as an ongoing unit." , 
                    " 9. This team should continue to function as a unit. ", 
                    " 10. This team has the resources to perform well in the future. ", 
                    " 11. This team is well positioned for growth over time. ", 
                    " 12. This team can develop to meet future challenges. ", 
                    " 13. This team has the capacity to sustain itself. ", 
                    " 14. This team has what it takes to endure in future performance episodes.", "condition", "results.condition") 
names(viabilityLikert) <- rep(viabilityLabels) 
```

## Likert visualizations: 
```{r}
## Likert graph for all viability responses across all conditions (i.e. baseline, control and treatment): 
likert.out <- likert(viabilityLikert[-c(15:16)]) 
plot(likert.out)
## Subset A and Ap (i.e. A prime) responses for treatment group: 
# Treatment + A: 
treatmentA <- viabilityLikert %>% filter(condition=="A" & results.condition=="treatment") 
likert.treatmentA <- likert(treatmentA[-c(15:16)])
plot(likert.treatmentA)
# Treatment + Ap: 
treatmentAp <- viabilityLikert %>% filter(condition=="Ap" & results.condition=="treatment") 
likert.treatmentAp <- likert(treatmentAp[-c(15:16)])
plot(likert.treatmentAp)
# Treatment + B: 
treatmentB <- viabilityLikert %>% filter(condition=="B" & results.condition=="treatment") 
likert.treatmentB <- likert(treatmentB[-c(15:16)])
plot(likert.treatmentAp)
## Customize code above to filter on other conditions and treatments of interest. 
```

## Exploratory data analysis part #1: 
```{r}
## Create a new dataframe that converts factors to numeric for statistical analyses: 
stats <- clean %>% mutate_if(is.factor, as.numeric)
for (i in 1:nrow(stats)) {
  stats$sum[i] <- sum(stats[i,1:14])                          
} 
stats$median <- median(stats$sum)
stats$mean <- mean(stats$sum)
## Revalue repeat team: keep plyr b/c some weird R stuff requires library to be called directly (recode values to )
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Yes"="0", "No"="1"))
stats$repeatTeam <- plyr::revalue(stats$repeatTeam, c("Keep this team"="0", "Do not keep this team"="1"))
## Convert to compatible classes for team grouping: 
stats$repeatTeam <- as.numeric(stats$repeatTeam)
stats$results.condition <- unlist(stats$results.condition)
stats$results.format <- as.character(stats$results.format)
stats$room <- unlist(stats$room)
## Dplyr to group teams and summarise variables for each group: group_by to find teams and summarise to compact individual results into group level results (i.e. one row of variable results per team)
groupedProportion <- stats %>%
  group_by(room, batch, round, condition, results.condition, results.format) %>% 
  summarise(n=n(), mean=mean(sum), median=median(sum),prop=sum(repeatTeam)/n) %>% 
## Filter out all teams with n=1 (i.e. a person in a chat room alone)
  filter(n>1)
## Individual proportion: 
individualProportion <- stats %>% group_by(round, batch, room) %>% 
  mutate(sum=sum, mean=mean(sum), median=median(sum), n=n(),prop=sum(repeatTeam)/n) %>% filter(n>1)

## Table showing how many teams we've run in each condition combination:  
table(groupedProportion$condition, groupedProportion$results.condition)
```

## Exploratory data analysis part #2: 
```{r}
## Plot of prop and mean per team with standard error: 
ggplot(groupedProportion, aes(x=prop, y=mean)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") + 
  labs(subtitle="", 
       x="Proportion of answers to Q15: ",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

## Unlist for condition for graph: 
groupedProportion$results.condition <- unlist(groupedProportion$results.condition)
## Team viability sum and Q15 factor repeat team answers facet_grid by condition, results.condition and results.format (all users) 
g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Sum of viability measures grouped by repeat team question:", 
       x="Distribution of responses to Q15: ",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.format)

g <- ggplot(stats, aes(factor(repeatTeam), sum))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Individual fracture score vs. mean viability:", 
       x="",
       y="Numeric sum of viability measures questions (range: 7-70)") + facet_grid(condition ~ results.condition)

## groupedProportion: anlaysis on team level 

ggplot(groupedProportion, aes(x=prop, y=mean)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(title="Team fracture proportion vs. team mean viability") 

## Mean vs. median plot for viability sums per team: 
ggplot(groupedProportion, aes(x=mean, y=median)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") + labs(main="Scatterplot of median vs. mean for team viability sums")

ggplot(data=groupedProportion, aes(groupedProportion$prop)) + 
  geom_histogram(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Count of team proportions of fracture value responses", 
                                  x="", y="Count") + facet_grid(condition ~ results.format)
```

## Probability of fracture across teams and condition combinations: 
```{r}
## Define and initialize cut-off point for fracture: 
groupedProportionFracture <- groupedProportion %>%
  group_by(results.condition, condition) %>% 
  mutate(fracture = case_when (prop<.50 ~ "0", 
                                prop>=.50~ "1"))

## Probability of fracture given condition in treatment: 
conditionalPropTreatment <- groupedProportionFractureTreatment %>% filter(condition=="A" | condition=="Ap")
tally(~fracture | condition, data = conditionalPropTreatment, format = "proportion")
```

## Boxplot for fracture scores vs. viability score sum (7-70)
```{r}
groupedProportionFracture$fracture <- as.numeric(groupedProportionFracture$fracture)
g <- ggplot(groupedProportionFracture, aes(factor(fracture), median))
g + geom_boxplot(varwidth=T, fill="plum") + 
  labs(subtitle="Team median viability score in relation to binary fracture value", 
       x="Binary fracture value",
       y="Numeric sum of viability measures questions (range: 7-70)") + scale_x_discrete(labels=c("0" = "No Fracture", "1" = "Fracture")) + facet_grid(condition ~ results.condition)

```


## Fracture proportion scatterplots: 
```{r}
## Learning effect // baseline: 
## Make new data that includes teams with different round 1 and round 3 teams: 
groupedProportionFractureLE <- groupedProportionFracture %>% filter(results.format=="c(1, 1, 2)" | results.format=="c(2, 1, 1)" & round=="1" | round=="3") 

fractureLE1 <- groupedProportionFractureLE %>% filter(round=="1")
fractureLE2 <- groupedProportionFractureLE %>% filter(round=="3")
fractureLE <- cbind(fractureLE1, fractureLE2)

## Treatment: 
groupedProportionFractureTreatment <- groupedProportionFracture %>% 
filter(results.condition=="treatment")
treatmentFracture1 <- groupedProportionFractureTreatment %>% filter(condition=="A")
treatmentFracture2 <- groupedProportionFractureTreatment %>% filter(condition=="Ap")
treatmentFracture <- cbind(treatmentFracture1, treatmentFracture2)

## Control:  
groupedProportionFractureControl <- groupedProportionFracture %>% 
  filter(results.condition=="control") 
controlFracture1 <- groupedProportionFractureControl %>% filter(condition=="A")
controlFracture2 <- groupedProportionFractureControl %>% filter(condition=="Ap")
controlFracture <- cbind(controlFracture1, controlFracture2)

ggplot(controlFracture, aes(prop, prop1)) +
  geom_point() +
  geom_jitter() + coord_fixed() + xlim(0.0,1.0) + ylim(0.0,1.0) + labs(title="Team fracture in the first vs. third in control", subtitle="No fracture=0, Fracture=1",  
                                                                       x="Fracture the first time a team interacts", 
                                                                       y="Fracture the second time a team interacts, without knowing it") 
## Defining fracture: 

## TREATMENT EFFECT: 
## Overall fracture change: 
sum(ifelse(treatmentFracture1$fracture==treatmentFracture2$fracture,0,1)) 
## No fracture to fracture: 
sum(ifelse(treatmentFracture1$fracture==0 & treatmentFracture2$fracture==1,1,0))
## Fracture to no fracture: 
sum(ifelse(treatmentFracture1$fracture==1 & treatmentFracture2$fracture==0,1,0))
## CONTROL EFFECT: 
## Overall fracture change: 
sum(ifelse(controlFracture1$fracture==controlFracture2$fracture,0,1))
## No fracture to fracture: 
sum(ifelse(controlFracture1$fracture==0 & controlFracture2$fracture==1,1,0))
## Fracture to no fracture: 
sum(ifelse(controlFracture1$fracture==1 & controlFracture2$fracture==0,1,0))
## LEARNING EFFECT 
## Overall fracture change: 
sum(ifelse(fractureLE1$fracture==fractureLE2$fracture,0,1)) 
## No fracture to fracture:
sum(ifelse(fractureLE1$fracture==0 & fractureLE2$fracture==1,1,0))
## Fracture to no fracture 
sum(ifelse(fractureLE1$fracture==1 & fractureLE2$fracture==0,1,0))
```
## Absolute change in fracture proportions between conditions: 
```{r}
fractureLE$abs <- abs(fractureLE2$prop-fractureLE1$prop)
g <- ggplot(fractureLE, aes(abs)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute value of change in team fracture proportions in learning effect condition", 
       fill = "Round") + 
  xlab(label="Absolute value of change in team fracture proportions in learning effect condition") + 
  ylab(label="Count") 

treatmentFracture$abs <- abs(treatmentFracture2$prop-treatmentFracture1$prop)
g <- ggplot(treatmentFracture, aes(abs)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture proportions in treatment condition", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture proportions in treatment condition") + 
  ylab(label="Count") 

controlFracture$abs <- abs(controlFracture2$prop-controlFracture1$prop)
g <- ggplot(controlFracture, aes(abs)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(bins=7, 
                   col="pink", 
                   size=.4) + 
  labs(title="Absolute change in team fracture proportions in control condition", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture proportions in control condition") + 
  ylab(label="Count") 

treatmentFractureAbsSum <- sum(treatmentFracture$abs) 
controlFractureAbsSum <- sum(controlFracture$abs)

df1 <- data.frame(treatmentFractureAbsSum, controlFractureAbsSum) 

ggplot(df1, aes(x=controlFractureAbsSum, y=treatmentFractureAbsSum)) +
    geom_bar(stat="identity") 

g <- ggplot(test,aes(x, y)) + scale_fill_brewer(palette = "Spectral")
g + geom_bar() + 
  labs(title="Absolute change in team fracture proportions in control condition", 
       fill = "Round") + 
  xlab(label="Absolute change in team fracture proportions in control condition") + 
  ylab(label="Count") 

dat <- data.frame(conditionChanges = factor(c("Sum of absolute fracture change in treatment","Sum of absolute fracture change in control"), levels=c("Sum of absolute fracture change in treatment","Sum of absolute fracture change in control")),
    fractureProportionChange = c(13.083, 1.166))

p <- ggplot(data=dat, aes(x=conditionChanges, y=fractureProportionChange)) +
    geom_bar(stat="identity")
p <- ggplotly(p)
p
```

## Histogram comparisons of fracture ratios across treatment, control and learning effect conditions

```{r}
ggplot(data=groupedProportionFractureTreatment, aes(fracture)) + 
  geom_bar(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Fracture ratios: treatment condition") + facet_grid(condition ~ results.condition)

ggplot(data=groupedProportionFractureControl, aes(fracture)) + 
  geom_bar(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Fracture ratios: control round") + facet_grid(condition ~ results.condition)

ggplot(data=groupedProportionFractureLE, aes(fracture)) + 
  geom_bar(breaks=seq(0, 1, by=0.20), 
                 col="red", 
                 fill="green", 
                 alpha=.2) + labs(title="Fracture ratios: learning effect condition") + facet_grid(results.condition ~ round)

```

## Graphs for mean distrbution per treatment condition: 

```{r}
## Mean viability distribution graph: treatment condition, A group 
barfill <- "#4271AE"
barlines <- "#1F3552"
## Mean for treatment and A group: ##fill-in) 
meanDistributionTreatmentA <- ggplot(treatmentAStats, aes(x = sum)) +
  geom_histogram(aes(y = ..count..), binwidth = 5,
                 colour = barlines, fill = barfill) +
  scale_x_continuous(name = "Median viability sum",
                     breaks = seq(0, 100, 20),
                     limits=c(0, 70)) +
  scale_y_continuous(name = "count") +
  ggtitle("Frequency of sum of viability scores: N=##fill-in") +
  theme_bw() +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(), panel.background = element_blank(),
        plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
        text=element_text(family="Tahoma"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9)) +
  geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
               linetype = "dashed")
meanDistributionTreatmentA

## Mean viability distribution graph: treatment condition, Aprime group 
## Mean for treatment condition, Aprime group: (fill-in)
meanDistributionTreatmentAp <- ggplot(treatmentApStats, aes(x = sum)) +
  geom_histogram(aes(y = ..count..), binwidth = 5,
                 colour = barlines, fill = barfill) +
  scale_x_continuous(name = "Median viability sum",
                     breaks = seq(0, 100, 20),
                     limits=c(0, 70)) +
  scale_y_continuous(name = "count") +
  ggtitle("Frequency of sum of viability scores:N=##fill-in") +
  theme_bw() +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(), panel.background = element_blank(),
        plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
        text=element_text(family="Tahoma"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9)) +
  geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
               linetype = "dashed")
meanDistributionTreatmentAp

## Mean viability distribution graph: treatment condition, B group 
## Mean for treatment condition, B group: 
barfill <- "#4271AE"
barlines <- "#1F3552"
meanDistributionTreatmentB <- ggplot(treatmentBStats, aes(x = sum)) +
  geom_histogram(aes(y = ..count..), binwidth = 5,
                 colour = barlines, fill = barfill) +
  scale_x_continuous(name = "Median viability sum \n across all teams in masked round",
                     breaks = seq(0, 98, 14),
                     limits=c(7, 70)) +
  scale_y_continuous(name = "count") +
  ggtitle("Frequency of sum of viability scores: masked condition, N=##fill-in") +
  theme_bw() +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_line(colour = "#d3d3d3"),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(), panel.background = element_blank(),
        plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
        text=element_text(family="Tahoma"),
        axis.text.x=element_text(colour="black", size = 9),
        axis.text.y=element_text(colour="black", size = 9)) +
  geom_vline(xintercept = ##fill-in, size = 1, colour = "#FF3721",
               linetype = "dashed") 
meanDistributionTreatmentB
```

## Chat data: 
```{r}
chatFiles = lapply(completeBatches, function(batch){
    chatFile = read_json(paste(dataPath,batch,"chats.json",sep="/"), simplifyVector = TRUE)
    return(flatten(chatFile, recursive = TRUE))
  })

allChatFiles <- plyr::ldply(chatFiles, data.frame)
chatFreq <- allChatFiles  %>%
  group_by(round, room, batch) %>% 
  dplyr::summarise(n=n()) %>% 
  filter(n>1, round<=2)

## Filter on round <=2, because round 3 in all but one case is only includes "X has left chat room" 

# Histogram grouped by round: 
g <- ggplot(chatFreq, aes(n)) + scale_fill_brewer(palette = "Spectral")
g + geom_histogram(aes(fill=factor(round)), 
                   bins=5, 
                   col="black", 
                   size=.1) +   # change number of bins
  labs(title="Histogram of the number of chat lines per team (includes teams with n=1)", 
       fill = "Round") + 
  xlab(label="Number of lines from chat data per team") + 
  ylab(label="Count") 
stats$room <- unlist(stats$room) 
chatFreqStats <- right_join(x=chatFreq, y=stats)
```

## Conditional probabilities: 
```{r}
groupedProportionFractureTreatment %>% 
  group_by(condition) %>% 
  summarise(n=n(), count=count(fracture)) 

# P(F3|F1) 
p1 <-  19 / 34

# P(notF3|F1) 
p2 <-  15 / 34

# P(F3|F1)
p3 <- 24 / 34

## P(notF3|F1)
p4 <- 10 / 34

  # Print out the probabilities
round(c(p1, p2, p3, p4), digits = 2)  


```



